# Data science stack
import numpy as np
import pandas as pd

# Plotly, Chart Studio and the Cufflinks wrapper on plotly
import chart_studio.plotly as py
import cufflinks
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

# IPython shell class, used below to control which cell results are shown
from IPython.core.interactiveshell import InteractiveShell

# Show up to 30 columns when pandas displays a frame
pd.options.display.max_columns = 30

# Display the result of every expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Put Cufflinks and Plotly into offline notebook mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Global Cufflinks theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
Load the datasets and select the features of interest.
from src.load_datasets import load_datasets
from src.prepare_datasets import feature_list

train, test = load_datasets()

# Select the feature columns of each split first ...
train_features = train[feature_list]
test_features = test[feature_list]

# ... then move the 'timestamp' column (interpreted as epoch milliseconds)
# out of each raw frame and use it as the datetime index of its features.
for raw_frame, feature_frame in ((train, train_features), (test, test_features)):
    feature_frame.index = pd.to_datetime(raw_frame.pop('timestamp'), unit='ms')

train_features
| high | low | open | close | volume | |
|---|---|---|---|---|---|
| timestamp | |||||
| 2016-08-10 15:53:00 | 579.000000 | 579.00 | 579.00 | 579.000000 | 1.000000 |
| 2016-08-10 15:54:00 | 604.750000 | 592.96 | 592.96 | 604.750000 | 131.838200 |
| 2016-08-10 15:55:00 | 604.750000 | 600.00 | 604.75 | 600.000000 | 89.437926 |
| 2016-08-10 15:57:00 | 604.750000 | 600.00 | 600.00 | 604.750000 | 51.328200 |
| 2016-08-10 15:58:00 | 604.750000 | 604.75 | 604.75 | 604.750000 | 4.586500 |
| ... | ... | ... | ... | ... | ... |
| 2020-04-22 12:37:00 | 6946.000000 | 6945.00 | 6945.90 | 6946.000000 | 0.130838 |
| 2020-04-22 12:38:00 | 6948.700000 | 6946.00 | 6946.00 | 6948.684645 | 0.951414 |
| 2020-04-22 12:39:00 | 6951.300000 | 6947.10 | 6948.70 | 6950.100000 | 0.055320 |
| 2020-04-22 12:40:00 | 6952.628959 | 6951.30 | 6951.30 | 6952.628959 | 0.168686 |
| 2020-04-22 12:41:00 | 6954.692469 | 6953.90 | 6953.90 | 6954.600000 | 0.046001 |
1816296 rows × 5 columns
In theory we are going to use 4 features: the price itself and three extra technical indicators:
MACD (trend), Stochastics (momentum) and Average True Range (volatility).
Exponential Moving Average: Is a type of infinite impulse response filter that applies weighting factors which decrease exponentially. The weighting for each older datum decreases exponentially, never reaching zero.
MACD: The Moving Average Convergence/Divergence oscillator (MACD) is one of the simplest and most effective momentum indicators available. The MACD turns two trend-following indicators, moving averages, into a momentum oscillator by subtracting the longer moving average from the shorter moving average.
Stochastics oscillator: The Stochastic Oscillator is a momentum indicator that shows the location of the close relative to the high-low range over a set number of periods.
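Average True Range: An indicator that measures volatility (NOT price direction). It is a moving average of the true range, which is the largest of: the current high minus the current low; the absolute value of the current high minus the previous close; the absolute value of the current low minus the previous close.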
from src.indicators import MACD, stochastics_oscillator, ATR

# Only the trailing window is plotted: 60 days' worth of rows, assuming
# one row per minute (24 * 60 rows per day).
days_to_show = 60
items_to_show = days_to_show * 24 * 60
recent = slice(-items_to_show, None)

# MACD of the close price with parameters 12, 26, 9
macd = MACD(train_features['close'][recent], 12, 26, 9)
macd_frame = pd.DataFrame({'MACD': macd})
macd_frame.iplot()

# Stochastic oscillator of the close price with a period argument of 14
stochastics = stochastics_oscillator(train_features['close'][recent], 14)
stochastics_frame = pd.DataFrame({'Stochastics Oscillator': stochastics})
stochastics_frame.iplot()

# ATR receives the whole feature frame rather than only the close column
atr = ATR(train_features.iloc[recent], 14)
atr.head()
atr.iplot()
| 0 | |
|---|---|
| timestamp | |
| 2020-02-22 03:08:00 | 0.012297 |
| 2020-02-22 03:09:00 | 0.100000 |
| 2020-02-22 03:10:00 | 0.012297 |
| 2020-02-22 03:11:00 | 0.100000 |
| 2020-02-22 03:12:00 | 0.100000 |
import scipy.stats as stats
import pylab

# Row-to-row relative change of the close price. The first element is
# dropped: pct_change has no previous row to compare it with.
close_returns = train_features['close'].pct_change()
close_change = close_returns.iloc[1:]
close_change.head()

# Probability plot of the changes against a normal distribution
stats.probplot(close_change, dist='norm', plot=pylab)
timestamp 2016-08-10 15:54:00 0.044473 2016-08-10 15:55:00 -0.007854 2016-08-10 15:57:00 0.007917 2016-08-10 15:58:00 0.000000 2016-08-10 15:59:00 0.000000 Name: close, dtype: float64
((array([-4.94453614, -4.76886449, -4.67400546, ..., 4.67400546,
4.76886449, 4.94453614]),
array([-0.17508113, -0.12714831, -0.0479099 , ..., 0.06530612,
0.06600338, 0.12148934])),
(0.00107797240830823, 2.207475798096329e-06, 0.8328053817597639))
import tensorflow as tf
import matplotlib.pyplot as plt
def plot_log_freaquency(series, samples_per_day=24 * 60):
    """Plot the magnitude of the real FFT of *series* on a log-frequency axis.

    The x axis is expressed in cycles per year, so the ticks at 1 and 365
    mark the "once a year" and "once a day" frequencies.

    The conversion from FFT bin to cycles per year needs the number of
    samples per year. Treating every sample as one day places the
    ``1/Year`` and ``1/day`` ticks on the wrong frequencies when the series
    holds one row per minute, so the sampling rate is a parameter that
    defaults to minute data.

    Args:
        series: 1-D numeric sequence (e.g. a pandas Series) of samples,
            assumed to be evenly spaced in time.
        samples_per_day: Number of samples per day in *series*. Defaults
            to ``24 * 60`` (one sample per minute). Pass ``1`` for daily
            data.
    """
    fft = tf.signal.rfft(series)
    # Bin k of the FFT corresponds to k cycles over the whole dataset.
    f_per_dataset = np.arange(0, len(fft))

    days_per_year = 365
    samples_per_year = samples_per_day * days_per_year
    years_per_dataset = len(series) / samples_per_year

    # Cycles per dataset -> cycles per year.
    f_per_year = f_per_dataset / years_per_dataset
    plt.step(f_per_year, np.abs(fft))
    plt.xscale('log')
    plt.xticks([1, 365], labels=['1/Year', '1/day'])
    plt.xlabel('Frequency (log scale)')
# Spectrum of each column and of its first difference, in the order
# close, close-diff, volume, volume-diff.
for column_name in ('close', 'volume'):
    column = train_features[column_name]
    plot_log_freaquency(column)
    plot_log_freaquency(column.diff().dropna())
import sweetviz as sv

# Sweetviz comparison of the two splits, with "close" as the third argument
train_source = [train_features, 'Train data']
test_source = [test_features, 'Test data']
compare_report = sv.compare(train_source, test_source, "close")
compare_report.show_notebook()

# Plot every 60th row, starting at position 59, one subplot per column
train_features.iloc[59::60].iplot(subplots=True)
test_features.iloc[59::60].iplot(subplots=True)
Only statistics of the training set (mean, deviation, minimum, maximum) are used for normalisation, so that the NN gets no access to the test dataset.
Divide by the max-min range.
# Show floats with two decimals in displayed frames
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics of the training features
train_features.describe()
| high | low | open | close | volume | |
|---|---|---|---|---|---|
| count | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 | 1816296.00 |
| mean | 6145.04 | 6138.47 | 6141.76 | 6141.75 | 17.74 |
| std | 3544.95 | 3538.94 | 3541.95 | 3541.94 | 52.61 |
| min | 563.24 | 562.99 | 563.10 | 563.00 | 0.00 |
| 25% | 3588.77 | 3586.88 | 3587.70 | 3587.80 | 0.69 |
| 50% | 6497.00 | 6494.50 | 6495.63 | 6495.60 | 3.94 |
| 75% | 8481.90 | 8475.00 | 8479.08 | 8479.00 | 15.13 |
| max | 19891.00 | 19880.00 | 19890.00 | 19891.00 | 6717.52 |
# Summary statistics of the test features, for comparison with the training set
test_features.describe()
| high | low | open | close | volume | |
|---|---|---|---|---|---|
| count | 454074.00 | 454074.00 | 454074.00 | 454074.00 | 454074.00 |
| mean | 18674.91 | 18652.66 | 18663.90 | 18663.96 | 5.55 |
| std | 12974.03 | 12947.02 | 12960.61 | 12960.68 | 23.19 |
| min | 6955.20 | 6952.80 | 6954.70 | 6955.04 | 0.00 |
| 25% | 9649.30 | 9645.11 | 9647.40 | 9647.35 | 0.18 |
| 50% | 11656.00 | 11651.75 | 11654.00 | 11654.00 | 0.97 |
| 75% | 23178.00 | 23150.00 | 23165.00 | 23163.99 | 3.73 |
| max | 58321.24 | 58304.00 | 58317.00 | 58317.00 | 2982.69 |
# Per-column statistics, computed on the training split only
train_min = train_features.min()
train_max = train_features.max()
train_mean = train_features.mean()
train_std = train_features.std()
The maximum of the training set is too small and would not allow values in the test dataset to be predicted correctly, so a manually chosen maximum of 100 thousand dollars is used for every feature except volume.
# Manually chosen upper bound for the four price columns; the 'volume'
# entry keeps the maximum observed in the training data.
MAX_TARGET = 100000
for price_column in ('high', 'low', 'open', 'close'):
    train_max[price_column] = MAX_TARGET

# Scale both splits by the same per-column range (max - min). The minimum
# is not subtracted first, so this is a pure division.
train_d = train_max - train_min
train_normalised = train_features.div(train_d)
test_normalised = test_features.div(train_d)
train_normalised.head()
| high | low | open | close | volume | |
|---|---|---|---|---|---|
| timestamp | |||||
| 2016-08-10 15:53:00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.00 |
| 2016-08-10 15:54:00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.02 |
| 2016-08-10 15:55:00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
| 2016-08-10 15:57:00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.01 |
| 2016-08-10 15:58:00 | 0.01 | 0.01 | 0.01 | 0.01 | 0.00 |
# Re-attach the datetime index of each split, then plot every 60th row
# (starting at position 59) with one subplot per column.
train_normalised.index = train_features.index
train_normalised.iloc[59::60].iplot(subplots=True, title="Train")

test_normalised.index = test_features.index
test_normalised.iloc[59::60].iplot(subplots=True, title="Test")
# Every 60th row, starting at position 59
every_60th = slice(59, None, 60)
train_in_hours = train_features.iloc[every_60th]

# Close price before and after normalisation, side by side
real_close = train_in_hours['close']
normalised_close = train_normalised['close'].iloc[every_60th]
feature2normaliesd = pd.DataFrame({'Real': real_close, 'Normalised': normalised_close})
feature2normaliesd.index = train_in_hours.index
feature2normaliesd.iplot(subplots=True)